Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import random
import math
import time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator

Loading the global deaths dataset

In [3]:
df = pd.read_csv("time_series_covid19_recovered_global.csv")
df.head()
Out[3]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 7/4/20 7/5/20 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20 7/11/20 7/12/20 7/13/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 19164 19366 20103 20179 20700 20847 20882 21135 21216 21254
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 1637 1657 1702 1744 1791 1832 1875 1881 1946 2014
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 11181 11492 11884 12094 12329 12637 13124 13124 13743 14019
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 800 800 800 800 802 802 803 803 803 803
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 108 108 108 117 117 117 117 118 118 118

5 rows × 178 columns

In [4]:
df1 = df.copy()
df1.drop(["Province/State","Lat","Long"],inplace=True,axis=1)
In [5]:
df1 = df1.melt(id_vars=["Country/Region"], var_name="Date", value_name="Value")
In [6]:
fig = px.line(df1, x="Date", y="Value", title='Recovered Cases over time in the world',color='Country/Region')
fig.show()

Top 5 Countries with Highest Recovered Cases

In [8]:
df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20',ascending=False)[:5]
Out[8]:
Country/Region 7/13/20
23 Brazil 1291251
174 US 1031939
79 India 571460
140 Russia 503168
35 Chile 286556
In [9]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Russia']['Date'], y=df1[df1["Country/Region"] == 'Russia']['Value'],
                    mode='lines',
                    name='Russia'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Chile']['Date'], y=df1[df1["Country/Region"] == 'Chile']['Value'],
                    mode='lines',
                    name='Chile'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Brazil']['Date'], y=df1[df1["Country/Region"] == 'Brazil']['Value'],
                    mode='lines',
                    name='Brazil'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))
fig.update_layout(
    title = "Time Series Analysis of (Date and Recovered Cases) for Countries with Highest Recovered Cases",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Recovered Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:385: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


Top 5 Countries with lowest Number of Recovered Cases

In [10]:
df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20')[:5]
Out[10]:
Country/Region 7/13/20
104 MS Zaandam 0
162 Sweden 0
184 Western Sahara 8
132 Papua New Guinea 8
150 Seychelles 11
In [11]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Sweden']['Date'], y=df1[df1["Country/Region"] == 'Sweden']['Value'],
                    mode='lines',
                    name='Sweden'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'MS Zaandam']['Date'], y=df1[df1["Country/Region"] == 'MS Zaandam']['Value'],
                    mode='lines',
                    name='MS Zaandam'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Seychelles']['Date'], y=df1[df1["Country/Region"] == 'Seychelles']['Value'],
                    mode='lines',
                    name='Seychelles'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Western Sahara']['Date'], y=df1[df1["Country/Region"] == 'Western Sahara']['Value'],
                    mode='lines',
                    name='Western Sahara'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Papua New Guinea']['Date'], y=df1[df1["Country/Region"] == 'Papua New Guinea']['Value'],
                    mode='lines',
                    name='Papua New Guinea'))
fig.update_layout(
    title = "Time Series Analysis of (Date and Recovered Cases) for Countries with Lowest Recovered Cases",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Recovered Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:385: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


Log of Recovered Cases Over time

In [12]:
fig = px.line(df1, x="Date", y="Value", title='Log of recovered cases over time for all the countries',color='Country/Region')
fig.update_layout(yaxis_type="log",
                 yaxis = dict(title_text = "log(Recovered Cases)"))
fig.show()
In [13]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Russia']['Date'], y=df1[df1["Country/Region"] == 'Russia']['Value'],
                    mode='lines',
                    name='Russia'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Chile']['Date'], y=df1[df1["Country/Region"] == 'Chile']['Value'],
                    mode='lines',
                    name='Chile'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Brazil']['Date'], y=df1[df1["Country/Region"] == 'Brazil']['Value'],
                    mode='lines',
                    name='Brazil'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))
fig.update_layout(
    title = "Log of Recovered Cases over time for top 5 countries",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Recovered Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()
In [ ]:
 

Country Specific Graphs

Distribution of Recovered Cases in U.S.

In [13]:
f,ax = plt.subplots(figsize=(15,6))
sns.distplot(df1[df1["Country/Region"] == "US"]["Value"])
plt.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Recovered Cases in US

In [17]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.update_layout(
    title = "Recovered cases in US",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Recovered Cases",
        title_font = {"size": 20},
        title_standoff = 25))

Log of Recovered Cases in US

In [19]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.update_layout(
    title = "Log of Recovered cases in US",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Recovered Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()

Recovered Cases in India

In [24]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))
fig.update_layout(
    title = "Recovered Cases in India",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Recovered Cases",
        title_font = {"size": 20},
        title_standoff = 25))

Log of Recovered Cases in India

In [23]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))
fig.update_layout(
    title = "Log of Recovered Cases in India",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Recovered Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()

Recovered Cases in Spain

In [27]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Spain']['Date'], y=df1[df1["Country/Region"] == 'Spain']['Value'],
                    mode='lines',
                    name='Spain'))
fig.update_layout(
    title = "Recovered Cases in Spain",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Recovered Cases",
        title_font = {"size": 20},
        title_standoff = 25))

Log of Recoverd Cases in Spain

In [28]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Spain']['Date'], y=df1[df1["Country/Region"] == 'Spain']['Value'],
                    mode='lines',
                    name='Spain'))
fig.update_layout(
    title = "Log of Recovered Cases in Spain",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Recovered Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")

Recovered Cases all over the world

In [29]:
country_tot = df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20',ascending=False)[:20]
In [32]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y=country_tot["Country/Region"],
    x= country_tot["7/13/20"],
    orientation='h',
    marker=dict(
        color='rgba(246, 78, 139, 0.6)',
        line=dict(color='rgba(246, 78, 139, 1.0)', width=2)
    )
))
fig.update_layout(
    title = "Recovered Cases all over the world",
    xaxis = dict(
        title_text = "Recovered Cases",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Country",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

World Daily Increase in Recovered Cases

In [33]:
world_daily = df1.groupby("Date").sum().reset_index()
In [34]:
fig = go.Figure(go.Bar(
            x=world_daily["Date"],
            y=world_daily["Value"],
            orientation='v'))
fig.update_layout(
    title = "World's increase in Recovered Cases",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Recovered Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

Prediction Part

In [35]:
df.head()
Out[35]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 7/4/20 7/5/20 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20 7/11/20 7/12/20 7/13/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 19164 19366 20103 20179 20700 20847 20882 21135 21216 21254
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 1637 1657 1702 1744 1791 1832 1875 1881 1946 2014
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 11181 11492 11884 12094 12329 12637 13124 13124 13743 14019
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 800 800 800 800 802 802 803 803 803 803
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 108 108 108 117 117 117 117 118 118 118

5 rows × 178 columns

In [36]:
columns = df.keys()
confirmed = df.loc[:, columns[4]:columns[-1]]
In [37]:
dates = confirmed.keys()
world_cases = []

for i in dates:
    confirmed_sum = confirmed[i].sum()
    world_cases.append(confirmed_sum)
In [38]:
days_since_1_22 = np.array([i for i in range(len(dates))]).reshape(-1, 1)
world_cases = np.array(world_cases).reshape(-1, 1)
In [39]:
days_in_future = 15
future_forcast = np.array([i for i in range(len(dates)+days_in_future)]).reshape(-1, 1)
adjusted_dates = future_forcast[:-15]
In [40]:
start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
future_forcast_dates = []
for i in range(len(future_forcast)):
    future_forcast_dates.append((start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))
In [41]:
X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(days_since_1_22, world_cases, test_size=0.15, shuffle=False) 

Prediction using Linear Regression

In [43]:
linear_model = LinearRegression(normalize=True, fit_intercept=True)
linear_model.fit(X_train_confirmed, y_train_confirmed)
test_linear_pred = linear_model.predict(X_test_confirmed)
linear_pred = linear_model.predict(future_forcast)
print('MAE:', mean_absolute_error(test_linear_pred, y_test_confirmed))
print('MSE:',mean_squared_error(test_linear_pred, y_test_confirmed))
MAE: 2667285.101165465
MSE: 7770353945393.55
In [44]:
print(linear_model.coef_)
print(linear_model.intercept_)
[[23054.7657028]]
[-821620.24324324]
In [46]:
plt.figure(figsize=(20, 12))
plt.plot(adjusted_dates, world_cases)
plt.plot(future_forcast, linear_pred, linestyle='dashed', color='orange')
plt.title('Number of Covid Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Number of Recovered Cases', size=30)
plt.legend(['Recovered Cases', 'Linear Regression Predictions'])
plt.xticks(size=15)
plt.show()

Future Prediction using Linear Regression

In [47]:
print('Linear regression future predictions:')
print(linear_pred[-15:])
Linear regression future predictions:
[[3189908.98904412]
 [3212963.75474692]
 [3236018.52044972]
 [3259073.28615252]
 [3282128.05185532]
 [3305182.81755812]
 [3328237.58326092]
 [3351292.34896372]
 [3374347.11466653]
 [3397401.88036933]
 [3420456.64607213]
 [3443511.41177493]
 [3466566.17747773]
 [3489620.94318053]
 [3512675.70888333]]

Prediction using Support Vector Machines

In [48]:
svm_confirmed = SVR(shrinking=True, kernel='poly',gamma=0.01, epsilon=1,degree=3, C=0.1)
svm_confirmed.fit(X_train_confirmed, y_train_confirmed)
svm_pred = svm_confirmed.predict(future_forcast)
C:\Users\Saurabh\Anaconda3\lib\site-packages\sklearn\utils\validation.py:752: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().

In [50]:
# check against testing data
svm_test_pred = svm_confirmed.predict(X_test_confirmed)
plt.figure(figsize=(15,6))
plt.plot(y_test_confirmed)
plt.plot(svm_test_pred)
plt.legend(['Test Data', 'SVM Predictions'])
print('MAE:', mean_absolute_error(svm_test_pred, y_test_confirmed))
print('MSE:',mean_squared_error(svm_test_pred, y_test_confirmed))
MAE: 671247.3568311642
MSE: 527886020726.4116
In [52]:
x = adjusted_dates
y = world_cases
pred = svm_pred
algo_name = 'SVM Predictions'
color = 'purple'
plt.figure(figsize=(15, 8))
plt.plot(x, y)
plt.plot(future_forcast, pred, linestyle='dashed', color=color)
plt.title('Worldwide Coronavirus Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Number of RecoveredCases', size=30)
plt.legend(['Recovered Cases', algo_name], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()
In [ ]: